%config InlineBackend.figure_formats = ["retina"]
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import scanpy as sc
from scipy import stats
from more_itertools import unique_everseen
import matplotlib.pyplot as plt
import matplotlib.colors
import seaborn as sns
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.n_jobs = 16
import anndata2ri
anndata2ri.activate()
%reload_ext rpy2.ipython
plt.rcParams.update({"font.family":"Reem Kufi"})
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
heatcolors = matplotlib.colors.LinearSegmentedColormap.from_list("", ["gray", "#000004FF", "#330A5FFF", "#781C6DFF","#BB3754FF",
"#ED6925FF", "#FCB519FF", "#FCFFA4FF"])
heatcolors_wr = matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "#FFF5F0", "#FEE0D2", "#FCBBA1", "#FC9272",
"#FB6A4A", "#EF3B2C", "#CB181D", "#A50F15", "#67000D"])
sc.settings.set_figure_params(dpi=100, dpi_save=300, format="pdf", frameon=True)
sns.set_context("paper", font_scale=1, rc={"font.size":8,"axes.labelsize":8,"axes.titlesize":14})
%%R
suppressWarnings(suppressMessages(require(scran)))
suppressWarnings(suppressMessages(require(scater)))
suppressWarnings(suppressMessages(library(DropletUtils)))
suppressWarnings(suppressMessages(library(batchelor)))
options(mc.cores = 24L)
%%R
# Load the individual data
# P5
do26386 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do26386_count/outs/filtered_feature_bc_matrix/")
colData(do26386)$Sample <- rep("P5", ncol(do26386))
colData(do26386)$Library <- rep("do26386", ncol(do26386))
do26387 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do26387_count/outs/filtered_feature_bc_matrix/")
colData(do26387)$Sample <- rep("P5", ncol(do26387))
colData(do26387)$Library <- rep("do26387", ncol(do26387))
# P10
do17821 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do17821_count/outs/filtered_feature_bc_matrix/")
colData(do17821)$Sample <- rep("P10", ncol(do17821))
colData(do17821)$Library <- rep("do17821", ncol(do17821))
# P15
do18195 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do18195_count/outs/filtered_feature_bc_matrix/")
colData(do18195)$Sample <- rep("P15", ncol(do18195))
colData(do18195)$Library <- rep("do18195", ncol(do18195))
# P20
do17824 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do17824_count/outs/filtered_feature_bc_matrix/")
colData(do17824)$Sample <- rep("P20", ncol(do17824))
colData(do17824)$Library <- rep("do17824", ncol(do17824))
# P25
do18196 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do18196_count/outs/filtered_feature_bc_matrix/")
colData(do18196)$Sample <- rep("P25", ncol(do18196))
colData(do18196)$Library <- rep("do18196", ncol(do18196))
# P30
do17825 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do17825_count/outs/filtered_feature_bc_matrix/")
colData(do17825)$Sample <- rep("P30", ncol(do17825))
colData(do17825)$Library <- rep("do17825", ncol(do17825))
# P35
do17827 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do17827_count/outs/filtered_feature_bc_matrix/")
colData(do17827)$Sample <- rep("P35", ncol(do17827))
colData(do17827)$Library <- rep("do17827", ncol(do17827))
# B6
do17815 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do17815_count/outs/filtered_feature_bc_matrix/")
colData(do17815)$Sample <- rep("B6", ncol(do17815))
colData(do17815)$Library <- rep("do17815", ncol(do17815))
do17816 <- read10xCounts("/home/projects/11001611/working/mahesh/RNA_SEQ_WORKFOLDER/Mouse_Spermatogenesis/Ernst_Odom/Data/do17816_count/outs/filtered_feature_bc_matrix/")
colData(do17816)$Sample <- rep("B6", ncol(do17816))
colData(do17816)$Library <- rep("do17816", ncol(do17816))
%%R -o ms_sce
# Merge all datasets
ms_sce <- cbind(do26386, do26387, do17821, do18195, do17824, do18196, do17825, do17827, do17815, do17816)
rownames(ms_sce) <- rowData(ms_sce)$Symbol
print(ms_sce)
%%R
bcrank <- barcodeRanks(counts(ms_sce))
# Only showing unique points for plotting speed.
uniq <- !duplicated(bcrank$rank)
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
xlab="Rank", ylab="Total UMI count", cex.lab=1.2)
abline(h=metadata(bcrank)$knee, col="dodgerblue", lty=2)
legend("bottomleft", legend=c("Knee"),
col=c("dodgerblue"), lty=2, cex=1.2)
%%R
MT.Genes <- grep("^mt-", rownames(ms_sce), value = T)
print(MT.Genes)
%%R
# Calculate QC metrics
ms_sce <- addPerCellQC(ms_sce, subsets = list(mt=MT.Genes))
# Remove low quality cells i.e., those with less than 200 genes expressed and high mt%
ms_sce <- ms_sce[,ms_sce$detected > 200 & ms_sce$subsets_mt_percent < 20]
# Remove unxpressed genes
ms_sce <- ms_sce[Matrix::rowSums(counts(ms_sce)) > 0, ]
%%R
print(ms_sce)
%%R
# Normalization
set.seed(55063)
clusters <- quickCluster(ms_sce)
ms_sce <- computeSumFactors(ms_sce, cluster = clusters)
ms_sce <- logNormCounts(ms_sce)
set.seed(55063)
mod_ms <- modelGeneVar(ms_sce)
hvgs_ms <- getTopHVGs(mod_ms, prop = 0.1)
length(hvgs_ms)
%%R
# Dimensionality reduction
set.seed(55063)
ms_sce <- runPCA(ms_sce, subset_row = hvgs_ms)
ms_sce <- runTSNE(ms_sce, dimred = "PCA", num_threads = 24, verbose = F)
%%R -o ms_sce
# Perform Batch correction
# Identify HVGs based on sample
set.seed(55063)
mod_ms_batch <- modelGeneVar(ms_sce, block = colData(ms_sce)$Sample)
hvgs_ms_batch <- getTopHVGs(mod_ms_batch, prop = 0.1)
set.seed(55063)
ms_sce_corrected <- fastMNN(ms_sce, subset.row = hvgs_ms_batch, batch=ms_sce$Sample)
set.seed(55063)
ms_sce_corrected <- runTSNE(ms_sce_corrected, dimred = "corrected", perplexity=300, num_threads = 12, verbose = F)
reducedDim(ms_sce, "mnn") <- reducedDim(ms_sce_corrected, "corrected")
reducedDim(ms_sce, "mnnTSNE") <- reducedDim(ms_sce_corrected, "TSNE")
%%R
length(hvgs_ms_batch)
ms_sce.var_names_make_unique()
ms_sce.obs['Sample'].value_counts()
ms_sce.obs['sum'].describe()
ms_sce.obs['detected'].describe()
sc.pl.tsne(ms_sce, color = ["Sample"],
size = 10,
edgecolor = "black",
linewidths = 0.1,
legend_fontsize = 8, title="Before batch correction")
ms_sce.obsm['X_tsne'] = ms_sce.obsm['mnnTSNE']
sc.pl.tsne(ms_sce, color = ["Sample"],
size = 10,
edgecolor = "black",
linewidths = 0.1, title="After batch correction",
legend_fontsize = 8)
ms_sce.obsm["X_pca"] = ms_sce.obsm["mnn"]
sc.pp.neighbors(ms_sce, n_neighbors = 5)
sc.tl.leiden(ms_sce, resolution = 2, key_added = 'leiden_2')
sc.pl.tsne(ms_sce, color=["leiden_2"],
size=8,
legend_loc = "on data",
edgecolor = "black",
linewidths = 0.1,
ncols = 3,
legend_fontsize = 8,
alpha=0.2)
wv = sc.pl.stacked_violin(ms_sce, groupby="leiden_2", var_names=["Ddx4", "Utf1", "Uchl1", "Id4", "Rec8", "Stra8", "Aurka", "Acrv1",
"Prm2", "Vim", "Amh", "Cd74", "Vwf", "Acta2", "Tagln", "Clu", "Sox9",
"Insl3", "Inhba", "Dcn", "Sparc"],
swap_axes=True, layer="logcounts")
ms_leiden_clusters = ms_sce.obs['leiden_2']
%%R -i ms_leiden_clusters
ms_sce$ms_leiden_clusters = ms_leiden_clusters
ms_sce$broadcluster <- as.character(ms_sce$ms_leiden_clusters)
%%R
ms_sce$broadcluster[ms_sce$broadcluster %in% c("7", "0", "16", "27", "2", "54", "9", "21", "41", "38", "12", "49", "43", "46", "52", "1", "48", "53")] = "Somatic"
ms_sce$broadcluster[ms_sce$broadcluster %in% c("17", "51", "42", "47", "40", "55", "56", "59", "57", "58", "45", "50")] = "Outliers"
ms_sce$broadcluster[!ms_sce$broadcluster %in% c("Outliers", "Somatic")] = "Germ"
%%R
# Seperate Germ and somatic cells
ms_sce_gs <- ms_sce[,ms_sce$broadcluster %in% c("Germ", "Somatic")]
ms_sce_gs <- ms_sce_gs[Matrix::rowSums(counts(ms_sce_gs)) > 0, ]
assay(ms_sce_gs, "normcounts") <- as(((2^logcounts(ms_sce_gs))-1), "dgCMatrix")
ms_sce_gs$ms_leiden_clusters <- as.character(ms_sce_gs$ms_leiden_clusters)
%%R -o ms_sce_gs
print(ms_sce_gs)
ms_sce_gs.obs.index = ms_sce_gs.obs['Barcode'].astype('str')+'-'+ms_sce_gs.obs['Sample'].astype('str')
ms_sce_gs.var_names_make_unique()
ms_sce_gs.obsm['X_tsne']=ms_sce_gs.obsm['mnnTSNE']
gsconditions = [
(ms_sce_gs.obs["ms_leiden_clusters"].isin(["1", "52"])),#Myoid
(ms_sce_gs.obs["ms_leiden_clusters"].isin(["0", "2", "7", "9", "16", "21", "27"])),#Immature Leydig
(ms_sce_gs.obs["ms_leiden_clusters"].isin(["41", "38", "12", "49", "43"])),#Leydig
(ms_sce_gs.obs["ms_leiden_clusters"].isin(["54"])),#Sertoli
(ms_sce_gs.obs["ms_leiden_clusters"].isin(["48", "53"])),#Macrophage
(ms_sce_gs.obs["ms_leiden_clusters"].isin(["46"])),#Endothelial
(ms_sce_gs.obs["ms_leiden_clusters"].isin(["44", "5", "18"])),#Spermatogonia
(ms_sce_gs.obs["ms_leiden_clusters"].isin(["39", "33", "6", "8", "4", "3", "14", "10", "30", "26", "15", "11", "13"])),#Spermatocytes
(ms_sce_gs.obs["ms_leiden_clusters"].isin(["19", "37", "20", "24", "31", "22", "29", "36"])),#Round spermatids
(ms_sce_gs.obs["ms_leiden_clusters"].isin(["34", "25", "23", "28","32"])),#Elongating Spermatids
(ms_sce_gs.obs["ms_leiden_clusters"].isin(["35"]))]#Sperm
gsgroups = ["Myoid", "Immature Leydig", "Leydig", "Sertoli", "Macrophage", "Endothelial", "Spermatogonia", "Spermatocytes","Round spermatids","Elongating spermatids","Sperm"]
gs_clusters = np.select(gsconditions, gsgroups, default="Germ")
ms_sce_gs.obs["broad_clusters"] = gs_clusters
ms_sce_gs.obs["broad_clusters"] = ms_sce_gs.obs["broad_clusters"].astype("category").cat.reorder_categories(["Myoid", "Immature Leydig", "Leydig", "Sertoli", "Macrophage", "Endothelial",
"Spermatogonia", "Spermatocytes","Round spermatids","Elongating spermatids","Sperm"],ordered=True)
ms_sce_gs.obs['ms_leiden_clusters_celltype'] = ms_sce_gs.obs['ms_leiden_clusters'].astype(str) +"-"+ms_sce_gs.obs['broad_clusters'].astype(str)
cols_clusters = ["#FFB292","gold", "lightslategray", "#936C00","lime", "red", "#F16913", "#4C7C5E", "#2171B5","#eaa9bd", "#91357d"]
sc.pl.tsne(ms_sce_gs, color=["broad_clusters"], legend_fontsize=10, palette=cols_clusters,
size=20, edgecolor="black", linewidths=0.1, title="")
sc.tl.dendrogram(ms_sce_gs, groupby="ms_leiden_clusters_celltype")
sc.pl.correlation_matrix(ms_sce_gs, groupby="ms_leiden_clusters_celltype", figsize=(10,6))
sc.tl.rank_genes_groups(ms_sce_gs, "broad_clusters", n_genes = 6000, method = "wilcoxon", layer = "logcounts", use_raw = False)
gslist = {}
for i in ms_sce_gs.obs["broad_clusters"].cat.categories:
genes = sc.get.rank_genes_groups_df(ms_sce_gs, group=i,pval_cutoff=0.01, log2fc_min=1.25,
key="rank_genes_groups").sort_values(by=["logfoldchanges"], ascending=[False]).dropna(axis=0)["names"].to_list()
gslist[i] = genes
for key, value in gslist.items():
#print value
print(key, len([item for item in value if item]))
gs_L1 = []
for i in ms_sce_gs.obs["broad_clusters"].cat.categories:
genes = sc.get.rank_genes_groups_df(ms_sce_gs, group=i,pval_cutoff=0.01, log2fc_min=1.25, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
gs_L1.extend(genes)
gs_unique_genes = list(unique_everseen(gs_L1))
len(gs_unique_genes)
gs_mat = sc.pl.matrixplot(ms_sce_gs, gs_unique_genes, groupby="broad_clusters", figsize=(4, 4),standard_scale="var",
linewidth=.000001,swap_axes=True, cmap=heatcolors_wr, dendrogram=False, layer="logcounts",
show=False, show_gene_labels=False)
%%R -o ms_sce_germ
## Germ
ms_sce_germ <- ms_sce[,ms_sce$broadcluster=="Germ"]
ms_sce_germ <- ms_sce_germ[Matrix::rowSums(counts(ms_sce_germ)) > 0, ]
ms_sce_germ$ms_leiden_clusters <- factor(ms_sce_germ$ms_leiden_clusters)
# Identify HVGs based on sample
set.seed(55063)
mod_ms_batch_germ <- modelGeneVar(ms_sce_germ, block=colData(ms_sce_germ)$Sample)
hvgs_ms_batch_germ <- getTopHVGs(mod_ms_batch_germ, prop=0.1)
print(length(hvgs_ms_batch_germ))
set.seed(55063)
ms_sce_germ_corrected <- fastMNN(ms_sce_germ, subset.row=hvgs_ms_batch_germ, batch=ms_sce_germ$Sample)
reducedDim(ms_sce_germ, "mnn") <- reducedDim(ms_sce_germ_corrected, "corrected")
ms_sce_germ$ms_leiden_clusters <- as.character(ms_sce_germ$ms_leiden_clusters)
ms_sce_germ.obs.index = ms_sce_germ.obs['Barcode'].astype('str')+'-'+ms_sce_germ.obs['Sample'].astype('str')
ms_sce_germ.var_names_make_unique()
ms_sce_germ.obs['sum'].describe()
ms_sce_germ.obs['detected'].describe()
ms_sce_germ.obsm['X_tsne'] = ms_sce_germ.obsm['mnnTSNE']
markers = [g.capitalize() for g in ["UTF1", "ID4","FGFR3", "SOHLH1", "UCHL1","DMRT1", "KIT", "STRA8", "REC8",
"PRDM9","SPO11","DMC1","MEIOB", "BRCA2","ATR","SYCP1","SMC1B", "SMC3","PIWIL1", "PIWIL2", "RAD51",
"SYCP2","SYCP3","HORMAD1","HORMAD2", "AURKA", "PLK1",
"NME5","SPACA1", "ACRV1", "AKAP14","TNP2","PRM2", "CRISP2", "TPPP2", "OAZ3"]]
sc.pl.tsne(ms_sce_germ, color=markers, legend_fontsize=8, color_map=heatcolors,
size=20, edgecolor="black", linewidths=0.1,wspace=0.1, hspace=0.2, ncols=4, layer="logcounts")
ms_sce_germ.obsm["X_pca"] = ms_sce_germ.obsm["mnn"]
ms_sce_germ.obs["ms_leiden_clusters"].cat.reorder_categories(['44', '5', '18', '39', '33', '6', '8', '4', '3',
'14', '10', '30', '26', '15', '11', '13', '19',
'37', '20', '24', '31', '22', '29', '36', '34',
'25', '23', '28','32', '35'], inplace = True)
germADconditions = [
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["44"])),#SPG1
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["5"])),#SPG2
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["18"])),#Diff SPG / pre-Lep
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["39"])),#Lep
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["33"])),#Zyg1
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["6"])),#Zyg2
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["8"])),#Zyg3
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["4"])),#Zyg4
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["3"])),#P1
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["14"])),#P2
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["10"])),#P3
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["30"])),#P4
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["26"])),#P5
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["15"])),#P/D
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["11"])),#D
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["13"])),#Sec. Spermatocytes
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["19"])),#RS1
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["37"])),#RS2
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["20"])),#RS3
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["24"])),#RS4
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["31"])),#RS5
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["22"])),#RS6
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["29"])),#RS7
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["36"])),#RS8
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["34"])),#ES1
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["25"])),#ES2
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["23"])),#ES3
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["28"])),#ES4
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["32"])),#ES5
(ms_sce_germ.obs["ms_leiden_clusters"].isin(["35"]))]#Sperm
germADgroups = ["Undiff1","Undiff2","Diff / pre-Lep", "Lep", "Zyg1", "Zyg2", "Zyg3", "Zyg4", "Pach1", "Pach2",
"Pach3", "Pach4", "Pach5", "Pach6", "Dip", "Sec. Spcs", "RS1","RS2","RS3","RS4","RS5","RS6","RS7","RS8", "ES1","ES2","ES3","ES4","ES5","Sperm"]
germAD_clusters = np.select(germADconditions, germADgroups, default="Germ")
ms_sce_germ.obs["germAD_clusters"] = germAD_clusters
ms_sce_germ.obs["germAD_clusters"] = ms_sce_germ.obs["germAD_clusters"].astype("category").cat.reorder_categories(["Undiff1","Undiff2","Diff / pre-Lep",
"Lep", "Zyg1", "Zyg2", "Zyg3", "Zyg4",
"Pach1", "Pach2", "Pach3", "Pach4", "Pach5", "Pach6",
"Dip", "Sec. Spcs", "RS1","RS2","RS3","RS4","RS5","RS6","RS7","RS8",
"ES1","ES2","ES3","ES4","ES5","Sperm"],ordered=True)
cols_clusters = ["#FEE6CE", "#FDAE6B", "#E6550D",
"#F7FCF5", "#E2ECE2", "#CDDDD0", "#B9CEBE", "#A4BEAC", "#90AF9A", "#7BA088", "#669075", "#528163", "#3D7151", "#29623F", "#14532D", "#00441B",
"#F7FBFF", "#DEEBF7", "#C6DBEF", "#9ECAE1", "#6BAED6", "#4292C6", "#2171B5", "#084594",
"#f3cbd3", "#eaa9bd", "#dd88ac", "#ca699d", "#b14d8e", "#91357d"]
sc.pl.tsne(ms_sce_germ, color=["germAD_clusters"], size=10, palette=cols_clusters,
edgecolor="black", linewidths=0.1, legend_fontsize=8, title="")
ms_sce_germ.obs['Annotated'] = ms_sce_germ.obs['germAD_clusters']
ms_sce_gs.obs['Annotated'] = ms_sce_gs.obs['broad_clusters']
ms_sce_gs.obs['Annotated'] = ms_sce_gs.obs['Annotated'].astype('str')
for idx, x in ms_sce_germ.obs['germAD_clusters'].iteritems():
ms_sce_gs.obs.at[idx, 'Annotated'] = x
ms_sce_gs.obs["Annotated"] = ms_sce_gs.obs["Annotated"].astype("category").cat.reorder_categories(["Myoid", "Immature Leydig", "Leydig", "Sertoli", "Macrophage", "Endothelial",
"Undiff1","Undiff2","Diff / pre-Lep", "Lep", "Zyg1", "Zyg2", "Zyg3", "Zyg4",
"Pach1", "Pach2", "Pach3", "Pach4", "Pach5", "Pach6", "Dip", "Sec. Spcs",
"RS1","RS2","RS3","RS4","RS5","RS6","RS7","RS8",
"ES1","ES2","ES3","ES4","ES5","Sperm"],ordered=True)
palette = ["#FFB292","gold", "lightslategray", "#936C00","lime", "red",
"#FEE6CE", "#FDAE6B", "#E6550D",
"#F7FCF5", "#E2ECE2", "#CDDDD0", "#B9CEBE", "#A4BEAC", "#90AF9A", "#7BA088", "#669075", "#528163", "#3D7151", "#29623F", "#14532D", "#00441B",
"#F7FBFF", "#DEEBF7", "#C6DBEF", "#9ECAE1", "#6BAED6", "#4292C6", "#2171B5", "#084594",
"#f3cbd3", "#eaa9bd", "#dd88ac", "#ca699d", "#b14d8e", "#91357d"]
sc.pl.tsne(ms_sce_gs, color=["Annotated"], size=10, palette=palette,
edgecolor="black", linewidths=0.1, legend_fontsize=8, title="")
sc.pp.neighbors(ms_sce_germ, n_neighbors=5) # PAGA - Partition-based graph abstraction
sc.tl.paga(ms_sce_germ, groups="germAD_clusters") # PAGA - Partition-based graph abstraction
sc.pl.paga(ms_sce_germ, color="germAD_clusters", threshold=0.2, fontsize=8, fontoutline=1, frameon=False)
ms_sce_germ.obs["Annotated"] = ms_sce_germ.obs["germAD_clusters"]
sc.tl.rank_genes_groups(ms_sce_germ, "germAD_clusters", n_genes=6000, method="wilcoxon", layer="logcounts", use_raw=False)
germlist = {}
for i in ms_sce_germ.obs["germAD_clusters"].cat.categories:
genes = sc.get.rank_genes_groups_df(ms_sce_germ, group=i,pval_cutoff=0.01, log2fc_min=1.25,
key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
germlist[i] = genes
germ_L1 = []
for i in ms_sce_germ.obs["germAD_clusters"].cat.categories:
genes = sc.get.rank_genes_groups_df(ms_sce_germ, group=i,pval_cutoff=0.01, log2fc_min=1.25, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
germ_L1.extend(genes)
germ_unique_genes = list(unique_everseen(germ_L1))
len(germ_unique_genes)
ax = sc.pl.matrixplot(ms_sce_germ, germ_L1, groupby="germAD_clusters", figsize=(10,4),standard_scale="var", linewidth=.000001, swap_axes=True,
cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=False)
ms_sce_germ.X = ms_sce_germ.layers["logcounts"]
# Load gene annnotation file
allgenes = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/28Feb/Mouse/Mouse_genes_annotation_gtf.csv")
allgenes.index = allgenes['gene_name'].tolist()
allgenes = allgenes[['gene_id', 'seqname']]
allgenes.columns = ['GeneStableID', 'Chromosome']
allgenes = allgenes[allgenes['Chromosome'].isin(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '3', '4', '5', '6', '7', '8', '9', 'MT', 'X', 'Y'])]
for ch in allgenes['Chromosome'].astype("category").cat.categories.tolist():
genes = allgenes["Chromosome"]==ch
Agenes = set(allgenes[genes].index.tolist()).intersection(ms_sce_germ.var_names.tolist())
ms_sce_germ.obs[ch] = np.ravel((ms_sce_germ[:, list(Agenes)].X != 0).sum(axis=1))
AllChr_df = pd.DataFrame()
for column in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', 'X', 'Y']:
x = ms_sce_germ.obs[column]
AllChr_df[column] = x
AllChr_df['germAD_clusters'] = ms_sce_germ.obs['germAD_clusters']
AllChr_df_melted = AllChr_df.melt(id_vars="germAD_clusters", var_name="key", value_name="value")
AllChr_df_melted_mean = AllChr_df_melted.groupby(["germAD_clusters", "key"], as_index=False).mean()
# Reshape the data
AllChr_df_mean_reshaped = AllChr_df_melted_mean.pivot(index="key",columns="germAD_clusters")
AllChr_df_melted_std = AllChr_df_melted.groupby(["germAD_clusters", "key"]).std().reset_index()
AllChr_df_melted_std_reshaped = AllChr_df_melted_std.pivot(index="key",columns="germAD_clusters")
plt.rcParams["axes.linewidth"] = 0.2
plt.rcParams["figure.figsize"]=6,3
cols = ["#87CEEB", "#83C8EA", "#7FC3E9", "#7BBEE9", "#78B8E8", "#74B3E8",
"#70AEE7", "#6DA8E7", "#69A3E6", "#659EE6", "#6298E5", "#5E93E5",
"#5A8EE4", "#5788E4", "#5383E3", "#4F7EE3", "#4C78E2", "#4873E2",
"#446EE1", "darkorange"]
i=0
for c in ['1', '2','3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', 'Y']:#range(23):
plt.plot(AllChr_df_mean_reshaped.iloc[0].index.get_level_values(1).tolist(),
AllChr_df_mean_reshaped.loc[c], linewidth=0.75, color=cols[i], label = c,
alpha=1)
i+=1
plt.plot(AllChr_df_mean_reshaped.iloc[0].index.get_level_values(1).tolist(),
AllChr_df_mean_reshaped.loc['X'], linewidth=1.5, marker='', color= "#C70039", label="X")
# Add legend
plt.legend(loc="upper right", ncol=2, fontsize=6, frameon=False, bbox_to_anchor=(1.15, 1))
# Add titles
plt.ylabel("Mean no. of genes", fontsize=8)
plt.xlabel("")
plt.xticks(rotation=40, fontsize=8,horizontalalignment="right", rotation_mode="anchor")
plt.tick_params(
axis="both", # changes apply to the x-axis
which="both", # both major and minor ticks are affected
bottom=True, # ticks along the bottom edge are off
top=False, left=True, # ticks along the top edge are off
labelbottom=True, pad=0.5, length=1.5, labelsize=8)
plt.yscale('log', basey=10)
plt.tight_layout()
plt.margins(0.003,0.003)
plt.grid(linestyle='-', linewidth='0.1')
# Retrive chromosomes X, Y, 9 and autosomal genes for X to A ratio
chrX_genes = allgenes["Chromosome"]=="X"
chrY_genes = allgenes["Chromosome"]=="Y"
chr9_genes = allgenes["Chromosome"]=="9"
chrA_genes = allgenes[~allgenes.Chromosome.isin(["X","Y", "MT"])]
# Retrieve the X and Y genes specifically present in the germ cells alone
germ_Xgenes = set(allgenes[chrX_genes].index.tolist()).intersection(ms_sce_germ.var_names.tolist())
germ_Ygenes = set(allgenes[chrY_genes].index.tolist()).intersection(ms_sce_germ.var_names.tolist())
germ_9genes = set(allgenes[chr9_genes].index.tolist()).intersection(ms_sce_germ.var_names.tolist())
germ_Agenes = set(chrA_genes.index.tolist()).intersection(ms_sce_germ.var_names.tolist())
ms_sce_germ.obs["Chr X"] = np.mean(
ms_sce_germ[:, list(germ_Xgenes)].X, axis=1).A1 / np.mean(ms_sce_germ[:, list(germ_Agenes)].X, axis=1).A1
ms_sce_germ.obs["Chr Y"] = np.mean(
ms_sce_germ[:, list(germ_Ygenes)].X, axis=1).A1 / np.mean(ms_sce_germ[:, list(germ_Agenes)].X, axis=1).A1
ms_sce_germ.obs["Chr 9"] = np.mean(
ms_sce_germ[:, list(germ_9genes)].X, axis=1).A1 / np.mean(ms_sce_germ[:, list(germ_Agenes)].X, axis=1).A1
germXA_data = ms_sce_germ.obs["germAD_clusters"].to_frame().join(ms_sce_germ.obs["Chr 9"]).join(ms_sce_germ.obs["Chr X"])#.join(ms_sce_germ.obs["Chr Y"])
# Melt the data to long shape
germXA_data_melted = germXA_data.melt(id_vars="germAD_clusters", var_name="key", value_name="value")
from scipy import stats
stats.mannwhitneyu(germXA_data_melted[(germXA_data_melted['germAD_clusters'].isin(['Diff / '])) & (germXA_data_melted['key'].isin(['Chr X']))].value,
germXA_data_melted[(germXA_data_melted['germAD_clusters'].isin(['Zyg1'])) & (germXA_data_melted['key'].isin(['Chr X']))].value)
from matplotlib.ticker import FormatStrFormatter
plt.rcParams["figure.figsize"]=6,3
bx = sns.violinplot(x="germAD_clusters", y="value",
hue="key", palette=["#0D8DB3","#C70039"],fliersize=0.1,linewidth=0.5,split=True,scale="count",inner="quartile",
data=germXA_data_melted)
bx.set(xlabel="")
bx.set_ylabel("Chromosome:Autosome ratio", fontsize=8)
bx.set_xticklabels(bx.get_xticklabels(), rotation=40, fontsize=8,horizontalalignment="right", rotation_mode="anchor", visible=True)
bx.grid(linestyle='-', linewidth='0.1')
bx.tick_params(
axis="both", # changes apply to the x-axis
which="major", # both major and minor ticks are affected
bottom=True, # ticks along the bottom edge are off
top=False, left=True, # ticks along the top edge are off
labelbottom=True,labeltop=False, pad=0.5, length=1.5)
bx.axhline(y=0.5,color='gray',linestyle='--')
bx.axhline(y=1,color='gray',linestyle='--')
plt.legend(loc="upper right", fontsize=6, frameon=False)
plt.tight_layout()
ms_sce_germ.obs["ChrX percent"] = (np.sum(
ms_sce_germ[:, list(germ_Xgenes)].X, axis=1).A1 / np.sum(ms_sce_germ.X, axis=1).A1)*100
ms_sce_germ.obs["ChrY percent"] = (np.sum(
ms_sce_germ[:, list(germ_Ygenes)].X, axis=1).A1 / np.sum(ms_sce_germ.X, axis=1).A1)*100
# Make a dataframe using selected columns
germ_dist_data = ms_sce_germ.obs['germAD_clusters'].to_frame().join(ms_sce_germ.obs['detected']).join(ms_sce_germ.obs['sum']).join(ms_sce_germ.obs['subsets_mt_percent']).join(ms_sce_germ.obs['ChrX percent']).join(ms_sce_germ.obs['ChrY percent'])
germ_dist_data.columns = ['germAD_clusters','No. of genes', 'UMI Count', '% Mito', '% ChrX', '% ChrY']
# Melt the data to long shape
germ_dist_data_melted = germ_dist_data.melt(id_vars='germAD_clusters', var_name='key', value_name='value')
g = sns.FacetGrid(germ_dist_data_melted, col="key", height=6, sharex=False, hue="germAD_clusters", aspect=.4, palette=cols_clusters)
g.map(sns.violinplot, "value", "germAD_clusters", label='xxlarge', linewidth=0.01).set_titles("{col_name}").set_axis_labels("","")
sc.pp.neighbors(ms_sce_germ, n_pcs=50, knn=False, method="gauss")# Find the neighbors
sc.tl.diffmap(ms_sce_germ)
ms_sce_germ.uns["iroot"] = np.flatnonzero(ms_sce_germ.obs["germAD_clusters"] == "Undiff1")[0] # Set the root cell
sc.tl.dpt(ms_sce_germ, n_dcs=2)
sc.settings.set_figure_params(dpi=100, dpi_save=300, format="pdf", frameon=True)
sns.set_context("paper", font_scale=1, rc={"font.size":8,"axes.labelsize":12,"axes.titlesize":12})
sc.pl.tsne(ms_sce_germ, color=["dpt_pseudotime"], legend_fontsize=8, cmap= "viridis",
size=10, edgecolor="black", linewidths=0.1) #Use gauss method to find neighbors to get proper pseudotime
# Load chr X gene annnotation file
Xgenes = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/28Feb/Mouse/Mouse_Xgenes_Annotated_updated.csv")
Xgenes.index = Xgenes['gene_name'].tolist()
Xgenes['GeneClass'].value_counts()
Xgenes[['Ancestral', 'Shared']] = Xgenes[['Ancestral','Shared']].fillna(value="NotShared")
germXsingle_sce = ms_sce_germ[:,ms_sce_germ.var['Symbol'][ms_sce_germ.var['Symbol'].isin(Xgenes['gene_name'][Xgenes['GeneClass']=="Single-copy"])].tolist()]
germXsingle_sce.obs_names_make_unique()
germXMulti_sce = ms_sce_germ[:,ms_sce_germ.var['Symbol'][ms_sce_germ.var['Symbol'].isin(Xgenes['gene_name'][Xgenes['GeneClass']=="Multicopy"])].tolist()]
germXMulti_sce.obs_names_make_unique()
germXAmpliconic_sce = ms_sce_germ[:,ms_sce_germ.var['Symbol'][ms_sce_germ.var['Symbol'].isin(Xgenes['gene_name'][Xgenes['GeneClass']=="Ampliconic"])].tolist()]
germXAmpliconic_sce.obs_names_make_unique()
germY_sce = ms_sce_germ[:,list(germ_Ygenes)]
germY_sce.obs_names_make_unique()
sc.tl.rank_genes_groups(germXsingle_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
sc.tl.rank_genes_groups(germXMulti_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
sc.tl.rank_genes_groups(germXAmpliconic_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
sc.tl.rank_genes_groups(germY_sce, "Annotated", n_genes=1000, method="wilcoxon", layer="logcounts", use_raw=False);
singleX_L1 = []
for i in germXsingle_sce.obs["Annotated"].cat.categories:
genes = sc.get.rank_genes_groups_df(germXsingle_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
singleX_L1.extend(genes)
MultiX_L1 = []
for i in germXMulti_sce.obs["Annotated"].cat.categories:
genes = sc.get.rank_genes_groups_df(germXMulti_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
MultiX_L1.extend(genes)
Ampli_L1 = []
for i in germXAmpliconic_sce.obs["Annotated"].cat.categories:
genes = sc.get.rank_genes_groups_df(germXAmpliconic_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
Ampli_L1.extend(genes)
singleY_L1 = []
for i in germY_sce.obs["Annotated"].cat.categories:
genes = sc.get.rank_genes_groups_df(germY_sce, group=i,log2fc_min=0.5, key="rank_genes_groups").sort_values(by="logfoldchanges", ascending=False).dropna(axis=0)["names"].to_list()
singleY_L1.extend(genes)
singleX_unique = pd.Series(singleX_L1).drop_duplicates().tolist()
MultiX_unique = pd.Series(MultiX_L1).drop_duplicates().tolist()
Ampli_unique = pd.Series(Ampli_L1).drop_duplicates().tolist()
singleY_unique = pd.Series(singleY_L1).drop_duplicates().tolist()
print("No. of single X copy genes: %d" %len(singleX_unique))
print("No. of multi X copy genes: %d" %len(MultiX_unique))
print("No. of ampliconic X genes: %d" %len(Ampli_unique))
print("No. of Y genes: %d" %len(singleY_unique))
XYgenes = {'X Single-copy':singleX_unique, 'X Multi-copy':MultiX_unique, 'X Ampliconic':Ampli_unique, 'Y':singleY_unique}
X_sma = {'X Single-copy':singleX_unique, 'X Multi-copy':MultiX_unique, 'X Ampliconic':Ampli_unique}
ax = sc.pl.matrixplot(ms_sce_germ, X_sma, groupby="Annotated", figsize=(8,8),standard_scale="var", linewidth=.01,swap_axes=True,
cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=False)
ax = sc.pl.matrixplot(ms_sce_germ, {'Multi':MultiX_unique, 'Ampli':Ampli_unique}, groupby="Annotated", figsize=(8,7),standard_scale="var", linewidth=.01,swap_axes=True,
cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=False)
Xgenes_ms = Xgenes[Xgenes['gene_name'].isin(ms_sce_germ.var['Symbol'])]
Xgenes_ms['GeneClass'].value_counts()
pd.crosstab(Xgenes_ms['GeneClass'], Xgenes_ms['Shared'])
MultiX_unique_shared = {}
for i in Xgenes_ms['Shared'].astype('category').cat.categories:
genes = [x for x in MultiX_unique if x in Xgenes_ms['gene_name'][(Xgenes_ms['GeneClass']=='Multicopy') & (Xgenes_ms['Shared']==i)].tolist()]
MultiX_unique_shared[i] = genes
ax = sc.pl.matrixplot(ms_sce_germ, MultiX_unique_shared, groupby="Annotated", figsize=(8,7),standard_scale="var", linewidth=.001,swap_axes=True,
cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=False)
AmpliX_unique_shared = {}
for i in Xgenes_ms['Shared'].astype('category').cat.categories:
genes = [x for x in Ampli_unique if x in Xgenes_ms['gene_name'][(Xgenes_ms['GeneClass']=='Ampliconic') & (Xgenes_ms['Shared']==i)].tolist()]
AmpliX_unique_shared[i] = genes
ax = sc.pl.matrixplot(ms_sce_germ, AmpliX_unique_shared, groupby="Annotated", figsize=(8,8),standard_scale="var", linewidth=.001,swap_axes=True,
cmap=heatcolors_wr, dendrogram=False, layer="logcounts", show_gene_labels=True)
sc.pl.dotplot(ms_sce_germ,singleY_unique, groupby="Annotated", figsize=(18,6), standard_scale="var",color_map=heatcolors_wr,
dendrogram=False, layer="logcounts")# Y genes
ms_unique = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/17Apr_rev/Mm_unique.txt")
Orthologs = pd.read_csv("/home/projects/11001611/working/prabhakaran/RNASeq/Human_Mouse_Monkey_orthologs_21Feb_clean.txt", sep=" ")
Orthologs = Orthologs[(Orthologs.Crab_eating_macaque_homology_type=="ortholog_one2one") & (Orthologs.Mouse_homology_type=="ortholog_one2one")]
mm_ortho_df = pd.DataFrame(columns=["1:1:1 Othologs", "Mouse_specific", "Other orthologs"])
for i in ms_sce_germ.obs['germAD_clusters'].astype('category').cat.categories.tolist():
df = i+"_df"
df = ms_sce_germ[ms_sce_germ.obs['germAD_clusters'].isin([i]),:];
#df = df[:,list(germ_Xgenes)]
sc.pp.filter_genes(df, min_cells=round((df.n_obs*0.05)));
mm_ortho_df.loc[i] = [len(df.var['ID'][df.var['ID'].isin(Orthologs.Mouse_gene_stable_ID.tolist())])/len(df.var['ID'])*100,
len(df.var['ID'][df.var['ID'].isin(ms_unique['x'].tolist())])/len(df.var['ID'])*100,
len(df.var['ID'][~df.var['ID'].isin(Orthologs.Mouse_gene_stable_ID.tolist()+ms_unique['x'].tolist())])/len(df.var['ID'])*100]
mm_ortho_df['Groups'] = mm_ortho_df.index
plt.rcParams["figure.figsize"]=3,10
mm_ortho_df_melted = mm_ortho_df.melt('Groups', var_name='Type', value_name='value')
g = sns.catplot(x="Groups", y="value", hue='Type', data=mm_ortho_df_melted, kind="point",
palette = ["#DF8F44FF" , "#D9C7A5", "#80796BFF"],s=0.1, height=5, aspect=1.3, scale = 0.6)
plt.xticks(rotation=40, fontsize=8,horizontalalignment="right", rotation_mode="anchor");
plt.xlabel("")
plt.ylabel("% Genes expressed", fontsize=10);